rm(list = ls())
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.4
library(stringr)
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.0.3
library(rgdal)
## Warning: package 'rgdal' was built under R version 4.0.4
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.0.3
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/KELPTMGD1561/Documents/R/win-library/4.0/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/KELPTMGD1561/Documents/R/win-library/4.0/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/KELPTMGD1561/Documents/R/win-library/4.0/rgdal/proj
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.4
library(htmlwidgets)
## Warning: package 'htmlwidgets' was built under R version 4.0.3
library(readstata13) #for reading the stata files
library(dplyr) #data manipulation
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)##foreign
library(htmlTable)##html tables
## Warning: package 'htmlTable' was built under R version 4.0.3
library(magrittr) #manipulate
## Warning: package 'magrittr' was built under R version 4.0.3
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(loose.rock) #for proper changing of the string cases
## Warning: package 'loose.rock' was built under R version 4.0.5
library(leaflet)
library(sf)
## Warning: package 'sf' was built under R version 4.0.3
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
Read the data form the .TAB
using the DPLYR package do the following t steps
Select the required names of the variables
Rename the variables using names that make sense
We rename using the Rename function provided by the dplyr package.
NB the new name comes before the old name. Vice versa won’t work
nassepvUpdate <- read.delim("Rladies_Raw_data/rawDataServer/NassepSixUpdate11.tab")
#head(NassepSixUpdate11)
nass_0<-nassepvUpdate %>% select(clu00,clu01,clu02,clu03,clu04,clu05,clu06,clu07,clu08,
clu09,clu10,gps_clus__Latitude,gps_clus__Longitude,
gps_clus__Accuracy,date_int,v1,v2, qw,hs1,perhs1,
hs2,pp_1, perhs2,hs3,perhs3, c0_0,c1_1,c2_2,
c3_3,n1,
c1a,c1b,cc2,dist_ch, cc3,time_ch,c4a,
c4b,dist_tow,c5a,c5b__1,c5b__2,c5b__3,
c5b__4,c5b__5,c5b__6, c5b__7, c6,c7,c8,
c9,c10, c11a,per1,c11b,per2,
c11c,per3,c12,c13,c13b,chief,chieftelphone,
asschief,asschieftelphone,villageelder,
villageeldertelephone,date_end)
### **Rename to make sense**
nass1<-nass_0 %>% rename(ClusterNumber=clu00,County=clu01,Subcounty=clu02,Division=clu03,
Location=clu04,SubLocation=clu05,GEOCODE=clu06,EAName_1=clu07,EAtype_Residence=clu08,
EAStatus=clu09,frame_component=clu10,clu_gps_lat=gps_clus__Latitude,clu_gps_long=gps_clus__Longitude,
clu_gps_accuracy=gps_clus__Accuracy,date_start_interview=date_int,
Total_Count_homesteads=v1,Total_households_Homesteads=v2,
cluster_req_seg=qw,households_Seg_1=hs1,Percent_seg_1=perhs1,
households_Seg_2=hs2,Pop_total_minus_pop_1=pp_1,
Percent_seg_2=perhs2,hh_Segmet3=hs3,Percent_seg_3=perhs3,
CumPercenths_0=c0_0,CumPerc_1=c1_1,CumPerc_2=c2_2,
CumPer_3=c3_3,Last_2_dig_cluster=n1,
Cluster_Listed=c1a,Why_not_listed=c1b,distance_County_HQ_to_cluster=cc2,dist_Km_Metres=dist_ch,
Time_County_HQ_to_cluster=cc3,time_hours_minutes=time_ch,appr_town_spend_night=c4a,
Dist_townorfacility_cluster=c4b,distance_in_KM_Metres=dist_tow,clu_req_security=c5a,
causes_insecurity_1=c5b__1,causes_insecurity_2=c5b__2,causes_insecurity_3=c5b__3,
causes_insecurity_4=c5b__4,causes_insecurity_5=c5b__5,causes_insecurity_6=c5b__6,
causes_insecurity_7=c5b__7, mode_transport=c6,terrain_cluster=c7,size_cluster=c8,
households_within_cluster=c9,current_settlement_cluster=c10,
native_languages_1=c11a,Per_1_lang=per1,native_languages_2=c11b,Per_2_lang=per2,
native_languages_3=c11c,Per_3_lang=per3,
main_econ_act=c12,EAName=c13,name_EA_asknown=c13b,name_chief=chief,telephone_chief=chieftelphone,
name_Assist_Chief=asschief,telephone_Ast_chief=asschieftelphone,Village_Elder=villageelder,
telephone_village_elder=villageeldertelephone,Date_End=date_end)
head(nass1)
## ClusterNumber County Subcounty Division Location SubLocation GEOCODE
## 1 999999 nairobi ##N/A## ##N/A## ##N/A## ##N/A## -999999999
## 2 999998 kiambu ##N/A## ##N/A## ##N/A## ##N/A## -999999999
## EAName_1 EAtype_Residence EAStatus frame_component clu_gps_lat clu_gps_long
## 1 herufi 2 -999999999 -999999999 -1.303507 36.81034
## 2 mucatha 1 -999999999 -999999999 -1.303354 36.81041
## clu_gps_accuracy date_start_interview Total_Count_homesteads
## 1 4.900 2021-04-21T14:04:49 2
## 2 16.413 2021-04-22T09:39:59 2
## Total_households_Homesteads cluster_req_seg households_Seg_1 Percent_seg_1
## 1 300 1 100 33
## 2 200 1 101 50
## households_Seg_2 Pop_total_minus_pop_1 Percent_seg_2 hh_Segmet3 Percent_seg_3
## 1 120 200 40 80 27
## 2 99 99 50 NA -999999999
## CumPercenths_0 CumPerc_1 CumPerc_2 CumPer_3 Last_2_dig_cluster
## 1 0 33 73 100 99
## 2 0 50 100 -999999999 98
## Cluster_Listed Why_not_listed distance_County_HQ_to_cluster dist_Km_Metres
## 1 -999999999 NA -999999999 NA
## 2 -999999999 NA -999999999 NA
## Time_County_HQ_to_cluster time_hours_minutes appr_town_spend_night
## 1 -999999999 NA ##N/A##
## 2 -999999999 NA ##N/A##
## Dist_townorfacility_cluster distance_in_KM_Metres clu_req_security
## 1 -999999999 NA -999999999
## 2 -999999999 NA -999999999
## causes_insecurity_1 causes_insecurity_2 causes_insecurity_3
## 1 NA NA NA
## 2 NA NA NA
## causes_insecurity_4 causes_insecurity_5 causes_insecurity_6
## 1 NA NA NA
## 2 NA NA NA
## causes_insecurity_7 mode_transport terrain_cluster size_cluster
## 1 NA -999999999 -999999999 -999999999
## 2 NA -999999999 -999999999 -999999999
## households_within_cluster current_settlement_cluster native_languages_1
## 1 -999999999 -999999999 ##N/A##
## 2 -999999999 -999999999 ##N/A##
## Per_1_lang native_languages_2 Per_2_lang native_languages_3 Per_3_lang
## 1 -999999999 NA NA NA NA
## 2 -999999999 NA NA NA NA
## main_econ_act EAName name_EA_asknown name_chief telephone_chief
## 1 ##N/A## -999999999 NA ##N/A## ##N/A##
## 2 ##N/A## -999999999 NA ##N/A## ##N/A##
## name_Assist_Chief telephone_Ast_chief Village_Elder telephone_village_elder
## 1 ##N/A## ##N/A## ##N/A## ##N/A##
## 2 ##N/A## ##N/A## ##N/A## ##N/A##
## Date_End
## 1 2021-04-21T14:16:12
## 2 2021-04-22T09:44:52
We rewrite for easy use. Use .tab my favorite due to space and can be used by different software
write.table(nass1, file="Rladies_Final_data/samplers/cluster.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
###structures
structure<- read.delim("Rladies_Raw_data/rawDataServer/structure.tab")
#head(structure)
nassepStructure<-merge(nassepvUpdate,structure, by="interview__id")
#head(nassepStructure)
clust2<-nassepStructure %>% select(clu00,structure__id,s01,
s03_gps__Latitude, s03_gps__Longitude, s03_gps__Accuracy ,
s03_gps__Altitude, s03_gps__Timestamp, s03_gps_1,
s04, s05, s06,s06b)
#head(clust2) ##clean structure
clust3<-clust2 %>% rename(Cluster_number=clu00,structure_number=structure__id,name_structure=s01,
gps_str_latitude=s03_gps__Latitude, gps_str_longitude=s03_gps__Longitude,
gps_str_Accuracy=s03_gps__Accuracy,gps_str_Altitude=s03_gps__Altitude,
gps_str_Time=s03_gps__Timestamp,PointGPS=s03_gps_1,
comments_str=s04,str_residential=s05, comment_purpose=s06,
Feauture_type=s06b)
head(clust3)
## Cluster_number structure_number name_structure gps_str_latitude
## 1 999999 1 mwenda apartments -1.303349
## 2 999999 2 ak1 -1.303336
## 3 999998 1 a1 -1.303351
## 4 999998 2 a5 -1.303348
## gps_str_longitude gps_str_Accuracy gps_str_Altitude gps_str_Time
## 1 36.81041 12.108 1739.3 2021-04-21T11:08:18
## 2 36.81040 13.511 1733.0 2021-04-21T11:13:58
## 3 36.81041 11.939 1733.7 2021-04-22T06:42:04
## 4 36.81041 11.908 1733.7 2021-04-22T06:44:02
## PointGPS comments_str str_residential
## 1 40.1300658751093,-3.21277709768003 ##N/A## 1
## 2 40.1327927923755,-3.21231883289518 ##N/A## 1
## 3 40.1322258095776,-3.21372058335391 ##N/A## 1
## 4 40.1309568480774,-3.21452928466577 ##N/A## 1
## comment_purpose Feauture_type
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
write.table(clust3, file="Rladies_Final_data/samplers/structure.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
household <- read.delim("Rladies_Raw_data/rawDataServer/household.tab")
nassepHouseHold<-merge(nassepvUpdate,household, by="interview__id")
cleanHouseUnsort<-subset(nassepHouseHold, select = c( clu00,clusterpart__id,structure__id,household__id,
h07,h08,h09,h10,h12,h13,h14,h15,h16,h17,h18,h19,
h20,h21,h23,h24,h24b,h25,h26b,h28))
house2<-cleanHouseUnsort %>%
mutate(str_add=case_when(
clusterpart__id==1 ~0,
clusterpart__id==2 ~40,
clusterpart__id==3 ~80,
clusterpart__id==4 ~120,
clusterpart__id==5 ~160,
clusterpart__id==6 ~200
)) %>%
group_by(clu00) %>%
mutate(s02=structure__id+str_add)
housfin<-subset(house2, select = c( clu00,s02,h07,h08,h09,h10,h12,h13,h14,h15,h16,h17,h18,h19,
h20,h21,h23,h24,h24b,h25,h26b,h28 ))
#View(housfin) ##clean household
hous3<-housfin %>% rename(cluster_number=clu00,Structure_number=s02,serial_num_HU=h07,
House_number=h08,HU_occupied=h09,reason_unoccupied=h10,
Name_HH_Head=h12,sex_head=h13,occupation=h14,Total_people=h15,
below_4_male=h16,below_4_female=h17,Five_seventeen_male=h18,
Five_seventeen_female=h19,eighteen_Above_male=h20,
eighteen_Above_female=h21,operate_business=h23,
Telephone_head=h24,Telephone_head_other=h24b,
Telephone_other_member=h25,relationship=h26b,comments=h28)
head(hous3)
## # A tibble: 5 x 22
## # Groups: cluster_number [2]
## cluster_number Structure_number serial_num_HU House_number HU_occupied
## <int> <dbl> <int> <chr> <int>
## 1 999999 1 1 a1 1
## 2 999999 1 2 a2 1
## 3 999999 2 1 a2 1
## 4 999998 1 1 a2 1
## 5 999998 2 1 a2 3
## # ... with 17 more variables: reason_unoccupied <lgl>, Name_HH_Head <chr>,
## # sex_head <int>, occupation <chr>, Total_people <int>, below_4_male <int>,
## # below_4_female <int>, Five_seventeen_male <int>,
## # Five_seventeen_female <int>, eighteen_Above_male <int>,
## # eighteen_Above_female <int>, operate_business <int>, Telephone_head <chr>,
## # Telephone_head_other <chr>, Telephone_other_member <chr>,
## # relationship <int>, comments <chr>
write.table(hous3, file="Rladies_Final_data/samplers/household.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
####for catographers cleaning points
#for feautures
r4 <- read.delim("Rladies_Raw_data/rawDataServer/r4.tab")
#head(r4)
#attach(r4)
#View(r4)
r41<-r4 %>% select(r4__id,ft,Clu_feat,sp,name_feature,gps_f__Latitude, gps_f__Longitude,gps_f__Accuracy,gps_f__Timestamp)
head(r41)
## r4__id ft Clu_feat sp name_feature gps_f__Latitude gps_f__Longitude
## 1 1 1 2 NA river mwenda -1.303448 36.81061
## 2 2 2 6 NA nairobi rail -1.303512 36.81065
## 3 1 1 9 NA boni forest -1.303357 36.81041
## gps_f__Accuracy gps_f__Timestamp
## 1 5.100 2021-04-21T11:21:13
## 2 3.600 2021-04-21T11:22:19
## 3 19.717 2021-04-22T06:39:00
##codes here exctract the data to be send to catographers for verification mapping and segmenting
s1 <- read.delim("Rladies_Raw_data/rawDataServer/mapperHSMF.tab")
##households/structure during quick count
d3<-s1 %>% select(clu00,s11)
#View(d3)
hom_str<-d3 %>% transmute(cluster_number = clu00,latitude = str_extract_all(s11,'-\\d\\.\\d+'),longitude = str_extract_all(s11, '\\d{2}\\.\\d+')) %>%
unnest(cols = everything())
head(hom_str)
## # A tibble: 6 x 3
## cluster_number latitude longitude
## <int> <chr> <chr>
## 1 999999 -3.21234578965294 40.133440772716
## 2 999999 -3.21479154251722 40.1335737423015
## 3 999999 -3.21644002320421 40.132856469716
## 4 999999 -3.21511583401042 40.131692593445
## 5 999999 -3.21627787768103 40.1308941201894
## 6 999999 -3.21385920389999 40.130393382724
write.table(hom_str, file="Rladies_Final_data/cartography/Quick_Count_Structure.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
d4<-s1 %>% select(clu00,segselect,p3)
#d4
d4_1<-d4 %>% filter(!is.na(segselect))
#d4_1
seg_point<-d4_1 %>% transmute(cluster_number = clu00,num_seg=segselect, latitude = str_extract_all(p3,'-\\d\\.\\d+'),longitude = str_extract_all(p3, '\\d{2}\\.\\d+')) %>%
unnest(cols = everything())
head(seg_point)
## # A tibble: 6 x 4
## cluster_number num_seg latitude longitude
## <int> <int> <chr> <chr>
## 1 999999 2 -3.21732748931697 40.126832186803
## 2 999999 2 -3.21685917444566 40.1272103259819
## 3 999999 2 -3.21661750025944 40.1275055156961
## 4 999999 2 -3.21605752326633 40.1277948016159
## 5 999999 2 -3.21546217896797 40.1278302243816
## 6 999999 2 -3.21494935736305 40.127777090233
write.table(seg_point, file="Rladies_Final_data/cartography/SEG_points.tab",
na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
d6 <- s1 %>% select(clu00,d1)
seg_1 <- d6 %>%
transmute(cluster_number = clu00,
latitude = str_extract_all(d1,'-\\d\\.\\d+'),
longitude = str_extract_all(d1, '\\d{2}\\.\\d+')) %>%
unnest(cols = everything())
write.table(seg_1, file="Rladies_Final_data/cartography/segment_1_centroids.tab",
na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
d7 <- s1 %>%
select(clu00,d2)
seg_2 <- d7 %>%
transmute(cluster_number = clu00,
latitude = str_extract_all(d2,'-\\d\\.\\d+'),
longitude = str_extract_all(d2, '\\d{2}\\.\\d+')) %>%
unnest(cols = everything())
write.table(seg_2, file="Rladies_Final_data/cartography/segment_2_centroids.tab",
na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
d8 <- s1 %>% select(clu00,d3)
seg_3 <- d8 %>% transmute(cluster_number = clu00,
latitude = str_extract_all(d3,'-\\d\\.\\d+'),
longitude = str_extract_all(d3, '\\d{2}\\.\\d+')) %>%
unnest(cols = everything())
write.table(seg_3, file="Rladies_Final_data/cartography/segment_3_centroids.tab",
na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
hm1 <- read.delim("Rladies_Raw_data/rawDataServer/hm1.tab")
m1<-hm1 %>% select(interview__id,hm1__id) %>% as.data.frame()
j1<-nassepvUpdate %>% select(interview__id,clu00,v1,v2)%>% as.data.frame()
n1<-m1 %>% left_join(j1, by="interview__id")
n2<-n1 %>% select(clu00,hm1__id,v1,v2) %>%
rename(cluster_number=clu00, str_id=hm1__id, total_str=v1, tot_hous=v2)%>%as.data.frame()
kk1<-hom_str %>% group_by(cluster_number) %>% mutate(str_id = row_number())%>%as.data.frame()
#left joining when the MAPPER is super data
n3<-kk1 %>% left_join(n2, by=c("cluster_number", "str_id"))
n31<-n3 %>% select(cluster_number,str_id,total_str,tot_hous,longitude,latitude)%>%
rename(cluster_number=cluster_number,str_id_MAPPER=str_id, total_str_LISTER=total_str,
tot_hous_LISTER=tot_hous, longitude=longitude,latitude=latitude)
write.table(n31, file="Rladies_Final_data/Quality/aMAPPER.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
#left joining when the LISTER is super data
n4<-n2 %>% left_join(kk1, by=c("cluster_number", "str_id"))
n41<-n4 %>% select(cluster_number,str_id,total_str,tot_hous,longitude,latitude)%>%
rename(cluster_number=cluster_number,str_id_LISTER=str_id, total_str_LISTER=total_str,
tot_hous_LISTER=tot_hous, longitude=longitude,latitude=latitude)
write.table(n41, file="Rladies_Final_data/Quality/aLISTER.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
#full join MAPPER and LISTER to get differences
d<-full_join(n2,kk1, by=c("cluster_number","str_id"))
write.table(d, file="Rladies_Final_data/Quality/bothLISTER_MAPPER.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
q1<-hom_str %>% group_by(cluster_number) %>% summarise(total_structure_MAPPER=n())%>% as.data.frame()
#Total listed by the LISTER during quick count
p1<-j1 %>% select(clu00, v1) %>% rename(cluster_number=clu00,total_structure_LISTER=v1)%>%as.data.frame()
c <- full_join(p1,q1, by="cluster_number")
write.table(c, file="Rladies_Final_data/Quality/LIST_MAP_AGGREGATE.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
###—————————————————–cluster update starts here
clu_main <- read.delim("Rladies_Raw_data/raw_data_for_preload/cluster_1.tab")
hh_main <- read.delim("Rladies_Raw_data/raw_data_for_preload/household_1.tab")
str_main <- read.delim("Rladies_Raw_data/raw_data_for_preload/structure_1.tab")
hh_main2 <- hh_main %>%
filter(!is.na(h07)) #filter any missing household number
hh_tot <- hh_main2 %>%
group_by(cluster_number) %>%
summarise(totHh=n()) %>%
arrange(totHh)
#check if there are repeated serialization withing households in the structure
hh_uniq <- hh_main2 %>%
group_by(cluster_number,Structure_number,serial_num_HU)%>%
filter(n()>1)##find any duplicates in the dataset after remove h07
hh_uniq #duplicated households
## # A tibble: 0 x 23
## # Groups: cluster_number, Structure_number, serial_num_HU [0]
## # ... with 23 variables: cluster_number <int>, Structure_number <int>,
## # serial_num_HU <int>, House_number <chr>, HU_occupied <int>, h07 <int>,
## # reason_unoccupied <lgl>, Name_HH_Head <chr>, sex_head <int>,
## # occupation <chr>, Total_people <int>, below_4_male <int>,
## # below_4_female <int>, Five_seventeen_male <int>,
## # Five_seventeen_female <int>, eighteen_Above_male <int>,
## # eighteen_Above_female <int>, operate_business <int>, Telephone_head <chr>,
## # Telephone_head_other <chr>, Telephone_other_member <chr>,
## # relationship <int>, comments <chr>
#Get total sum of structutes in a given cluster
str_tot <- str_main %>%
group_by(cluster_number)%>%
summarise(totStr=n()) %>%
arrange(totStr)
str_uniq<-str_main %>%
group_by(cluster_number,Structure_number)%>%
filter(n()>1)##find any duplicates in the structure
str_uniq##filter any duplicates structues
## # A tibble: 0 x 13
## # Groups: cluster_number, Structure_number [0]
## # ... with 13 variables: cluster_number <int>, Structure_number <int>,
## # name_structure <chr>, gps_str_latitude <dbl>, gps_str_longitude <dbl>,
## # gps_str_Accuracy <dbl>, gps_str_Altitude <dbl>, gps_str_Time <chr>,
## # PointGPS <chr>, comments_str <chr>, str_residential <int>,
## # comment_purpose <lgl>, Feauture_type <lgl>
#joining the structures and the households fo coding
hh_str_TT <- str_tot %>%
left_join(hh_tot, by="cluster_number")
#head(hh_str_TT)
##select the cluster number for the selected structures
##create a new str filtered depending on number of MAIN str in a cluster will be used for analysis
#str_sub <- str_tot %>%
# filter(totStr>50) #greater than 50 structures in the cluster
#head(str_sub)
#######--------------------------------clusters number selected for use
cluster_Numbers <- str_tot$cluster_number
###--------------------------------------------------------END CLUSTER NUMBERS
#--------------------------------------------------------start creating clusters
##
cluster <- clu_main %>%
filter(cluster_number %in% cluster_Numbers)
cluster$interview__id=cluster$cluster_number
str <- str_main %>%
filter(cluster_number %in% cluster_Numbers)
str$interview__id=str$cluster_number
hh <- hh_main2 %>%
filter(cluster_number %in% cluster_Numbers)
hh$interview__id=hh$cluster_number
##create the nassepv tab
#create the interview__id
###write the first nassep VI tab
str_gr <- str %>%
select(cluster_number,gps_str_latitude,gps_str_longitude,gps_str_Altitude)
str_gr2 <- aggregate(str_gr, list(str_gr$cluster_number), FUN=head, 1)
str_gr3 <- str_gr2 %>%
select(cluster_number,gps_str_latitude,gps_str_longitude,gps_str_Altitude)
g1 <- str_gr3 %>%
rename(gps_clus__Latitude = gps_str_latitude, gps_clus__Longitude = gps_str_longitude, gps_clus__Altitude = gps_str_Altitude)%>%
as.data.frame()
#g1
#head(g1)
cluster1 <- cluster %>%as.data.frame()
cluster1$clu00 <- as.integer(cluster1$cluster_number)
clu <- cluster1 %>% left_join(g1, by="cluster_number")
head(clu)
## cluster_number County Subcounty Division Location SubLocation GEOCODE
## 1 999999 nairobi ##N/A## ##N/A## ##N/A## ##N/A## -999999999
## 2 999998 kiambu ##N/A## ##N/A## ##N/A## ##N/A## -999999999
## EAName_1 EAtype_Residence EAStatus frame_component clu_gps_lat clu_gps_long
## 1 herufi 2 -999999999 -999999999 -1.303507 36.81034
## 2 mucatha 1 -999999999 -999999999 -1.303354 36.81041
## clu_gps_accuracy date_start_interview Total_Count_homesteads
## 1 4.900 2021-04-21T14:04:49 2
## 2 16.413 2021-04-22T09:39:59 2
## Total_households_Homesteads cluster_req_seg households_Seg_1 Percent_seg_1
## 1 300 1 100 33
## 2 200 1 101 50
## households_Seg_2 Pop_total_minus_pop_1 Percent_seg_2 hh_Segmet3 Percent_seg_3
## 1 120 200 40 80 27
## 2 99 99 50 NA -999999999
## CumPercenths_0 CumPerc_1 CumPerc_2 CumPer_3 Last_2_dig_cluster
## 1 0 33 73 100 99
## 2 0 50 100 -999999999 98
## Cluster_Listed Why_not_listed distance_County_HQ_to_cluster dist_Km_Metres
## 1 -999999999 NA -999999999 NA
## 2 -999999999 NA -999999999 NA
## Time_County_HQ_to_cluster time_hours_minutes appr_town_spend_night
## 1 -999999999 NA ##N/A##
## 2 -999999999 NA ##N/A##
## Dist_townorfacility_cluster distance_in_KM_Metres clu_req_security
## 1 -999999999 NA -999999999
## 2 -999999999 NA -999999999
## causes_insecurity_1 causes_insecurity_2 causes_insecurity_3
## 1 NA NA NA
## 2 NA NA NA
## causes_insecurity_4 causes_insecurity_5 causes_insecurity_6
## 1 NA NA NA
## 2 NA NA NA
## causes_insecurity_7 mode_transport terrain_cluster size_cluster
## 1 NA -999999999 -999999999 -999999999
## 2 NA -999999999 -999999999 -999999999
## households_within_cluster current_settlement_cluster native_languages_1
## 1 -999999999 -999999999 ##N/A##
## 2 -999999999 -999999999 ##N/A##
## Per_1_lang native_languages_2 Per_2_lang native_languages_3 Per_3_lang
## 1 -999999999 NA NA NA NA
## 2 -999999999 NA NA NA NA
## main_econ_act EAName name_EA_asknown name_chief telephone_chief
## 1 ##N/A## -999999999 NA ##N/A## ##N/A##
## 2 ##N/A## -999999999 NA ##N/A## ##N/A##
## name_Assist_Chief telephone_Ast_chief Village_Elder telephone_village_elder
## 1 ##N/A## ##N/A## ##N/A## ##N/A##
## 2 ##N/A## ##N/A## ##N/A## ##N/A##
## Date_End interview__id clu00 gps_clus__Latitude
## 1 2021-04-21T14:16:12 999999 999999 -1.303349
## 2 2021-04-22T09:44:52 999998 999998 -1.303351
## gps_clus__Longitude gps_clus__Altitude
## 1 36.81041 1739.3
## 2 36.81041 1733.7
write.table(clu, file="Rladies_Final_data/created_preload_files/nassepvUpdate.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
attach(str)
str2 <- str %>% select(interview__id,Structure_number)
data_nas <- str2 %>%
group_by(interview__id) %>%
mutate(clusterpart__id = ifelse(Structure_number %in% 1:40,1,
ifelse(Structure_number%in% 41:80,2,
ifelse(Structure_number%in% 81:120,3,
ifelse(Structure_number%in% 121:160,4,
ifelse(Structure_number%in% 161:200,5,6)))))) %>%
ungroup() %>%
group_by(interview__id,clusterpart__id) %>%
mutate(val=1:n()-1,nid = cur_group_id()) %>% ungroup() %>%
select(-clusterpart__id) %>% mutate(interview__id=paste0(interview__id,'.',nid),val=paste0('Structure_number','__',val)) %>% select(-nid) %>%
pivot_wider(names_from = c(val),values_from = Structure_number) %>%
mutate(interview__id=gsub("\\..*","", interview__id)) %>% group_by(interview__id) %>%
mutate(clusterpart__id=1:n()) #%>% select(order(colnames(.)))
head(data_nas)
## # A tibble: 2 x 4
## # Groups: interview__id [2]
## interview__id Structure_number__0 Structure_number__1 clusterpart__id
## <chr> <int> <int> <int>
## 1 999999 1 2 1
## 2 999998 1 2 1
###write the cluster tab
write.table(data_nas, file="Rladies_Final_data/created_preload_files/clusterpart.tab", na = "##NA##",
row.names = F, col.names = T, quote = F, sep = '\t')
############################### Creating the structure id based on cluster__id and interview__id from cluster tab
d3<-data_nas %>%
pivot_longer(cols = starts_with('Structure_number'), values_to = 'Structure_number') %>%
group_by(interview__id, clusterpart__id) %>%
mutate(structure__id = row_number())
d4 <- d3 %>%
filter(!is.na(Structure_number)) %>%
select(-name)
##manipulate to create the household tab
#check if there is any missing h07 household number and drop it
hh1 <- hh %>% drop_na(h07) #drop missing household numbers
hh2 <- hh1 %>% group_by(cluster_number,h07) %>%filter(n() > 1)#no duplicated households in a cluster
hh3 <- hh2 %>% group_by(cluster_number,Structure_number, serial_num_HU) %>%filter(n() > 1)#no duplicated households in a structure
hh4 <- subset(hh3, below_4_male+below_4_female+Five_seventeen_male+Five_seventeen_female+eighteen_Above_male+eighteen_Above_female != Total_people) ##sum of members in different households age category sum to the total in the household
#Exctract the interview id and cluster part id from the created cluster data frame
d4$interview__id <- as.integer(d4$interview__id)
hh5 <- hh1 %>% left_join(d4, by=c("interview__id","Structure_number")) #%>% arrange(interview__id, clusterpart__id,s02)
hh6 <- hh5 %>%
group_by(interview__id,structure__id) %>%
mutate(household__id= row_number(structure__id))
hhf <- hh6 %>%
select(interview__id, clusterpart__id, structure__id, household__id,Structure_number, serial_num_HU, h07)
hhff <- hhf %>% select(-Structure_number)
head(hhff)
## # A tibble: 5 x 6
## # Groups: interview__id, structure__id [4]
## interview__id clusterpart__id structure__id household__id serial_num_HU h07
## <int> <int> <int> <int> <int> <int>
## 1 999999 1 1 1 1 1
## 2 999999 1 1 2 2 2
## 3 999999 1 2 1 1 3
## 4 999998 1 1 1 1 1
## 5 999998 1 2 1 1 2
##############-----------------------------final household tab
write.table(hhff, file="Rladies_Final_data/created_preload_files/household.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
##############-----------------------------final household tab
###########################------------------------------------creating structure sheet
############-------------------------------------------------------------------------------------------------str
hh55 <- hh5 %>%
select(interview__id, Structure_number) %>%
group_by(interview__id,Structure_number)%>%
summarise() %>%
as.data.frame()
## `summarise()` has grouped output by 'interview__id'. You can override using the `.groups` argument.
d44 <- d4 %>%
as.data.frame() %>%
select(interview__id, Structure_number)
dz1 <- d44 %>%
anti_join(hh55, by=c("interview__id", "Structure_number"))
dz2 <- dz1 %>%
left_join(d4, by=c("interview__id","Structure_number"))
dz3 <- merge(hh5,dz2,by=c("interview__id","clusterpart__id","structure__id","Structure_number"))
dz4 <- dplyr::bind_rows(dz2, hh5)
################--------------------------------------------------------------------------------------------str
hhg <- dz4 %>%
select(interview__id ,clusterpart__id,structure__id,Structure_number, serial_num_HU,h07)
str2 <- hhg %>%
mutate(serial_num_HU = serial_num_HU - 1) %>%
pivot_wider(names_from = serial_num_HU,values_from = h07,names_prefix = "h06b__")
##merge with some selected variables from str
str3 <- str %>%
select(interview__id,Structure_number,gps_str_latitude,gps_str_longitude,gps_str_Altitude)
final.str <- str2 %>%
left_join(str3, by=c("interview__id","Structure_number"))
str22 <- final.str # %>%
# select(-h06b__NA)
str23 <- str22 %>%
# rename(gps__Latitude=s04,gps__Longitude=s05,gps__Altitude=s06)%>%
as.data.frame()
str24 <- str23[order(str23$interview__id,str23$clusterpart__id,str23$structure__id,str23$Structure_number),]#,clusterpart__id,structure__id,s02
head(str24)
## interview__id clusterpart__id structure__id Structure_number h06b__0 h06b__1
## 3 999998 1 1 1 1 NA
## 4 999998 1 2 2 2 NA
## 1 999999 1 1 1 1 2
## 2 999999 1 2 2 3 NA
## gps_str_latitude gps_str_longitude gps_str_Altitude
## 3 -1.303351 36.81041 1733.7
## 4 -1.303348 36.81041 1733.7
## 1 -1.303349 36.81041 1739.3
## 2 -1.303336 36.81040 1733.0
write.table(str24, file="Rladies_Final_data/created_preload_files/structure.tab", na = "",
row.names = F, col.names = T, quote = F, sep = '\t')
########################-------------------------------------final structure tab
kenya <- readOGR("Rladies_Raw_data/demoShp", "Makueni_District")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded ellps Clarke 1880 (RGS) in Proj4 definition: +proj=longlat
## +a=6378249.145 +rf=293.465 +no_defs
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS
## = dumpSRS, : Discarded datum Arc_1960 in Proj4 definition: +proj=longlat
## +a=6378249.145 +rf=293.465 +no_defs
## Warning in showSRID(wkt2, "PROJ"): Discarded ellps Clarke 1880 (RGS) in Proj4
## definition: +proj=longlat +a=6378249.145 +rf=293.465 +no_defs +type=crs
## Warning in showSRID(wkt2, "PROJ"): Discarded datum Arc 1960 in Proj4 definition
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\KELPTMGD1561\OneDrive - Norwegian Refugee Council\Scripts\R Ladies meetups\R-Ladies Nairobi github materials\R for official statistics\Rladies_Raw_data\demoShp", layer: "Makueni_District"
## with 54 features
## It has 30 fields
bins <- c(1, 500, 1000, 2500, 5000, 10000, 100000, 1000000, 3000000) #here we set the binning manually and store it as a vector
pal <- colorBin("RdYlBu", domain = kenya$SUM_Househ, bins = bins) #leaflet function to map data values to a color pallete. Colorbin returns a function, not a value.
labels <- paste(kenya$SLNAME, round(kenya$SUM_Househ, digits = 1))%>%
lapply(htmltools::HTML)
m <- leaflet() %>%
leaflet::addTiles() %>% # Add default OpenStreetMap map tiles
leaflet::addPolygons(data = kenya,
weight = 1, #polygon line weight
smoothFactor = 1, #smoothFactor addition smooths out the shapefile polylines to make it load faster. Values < 1 adds detail while values > 1 take detail away.
color = "white", #line color
fillOpacity = 0.8, #polygon opacity
fillColor = pal(kenya$SUM_Househ), #polygon color
label = labels
) %>%
leaflet::addLegend(pal = pal, #adds legend
values = kenya$SUM_Househ,
opacity = 0.7, #legend opacity
position = "topright") #legend position
m
kenya.polys <- st_read("migori/Migori_District.shp")
## Reading layer `Migori_District' from data source `C:\Users\KELPTMGD1561\OneDrive - Norwegian Refugee Council\Scripts\R Ladies meetups\R-Ladies Nairobi github materials\R for official statistics\migori\Migori_District.shp' using driver `ESRI Shapefile'
## Simple feature collection with 164 features and 30 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 33.94061 ymin: -1.386303 xmax: 34.73015 ymax: -0.6478643
## geographic CRS: Arc 1960
kenya.points <- read.delim("Rladies_Raw_data/rawDataServer/randomPointsMigori.tab", header = TRUE, stringsAsFactors = FALSE, quote = "", sep = "\t")
kenya.points <- st_as_sf(kenya.points, coords = c("longitude", "latitude"), crs = 4326)
leaflet() %>%
addProviderTiles(providers$Esri.WorldImagery,
options = providerTileOptions(noWrap = TRUE)) %>%
addCircles(data = kenya.points, color = "red") %>%
addPolygons(data = kenya.polys,
weight = 1)
## Warning: sf layer has inconsistent datum (+proj=longlat +a=6378249.145 +rf=293.465 +no_defs).
## Need '+proj=longlat +datum=WGS84'